YouTube Views with XGBoost
Predicting views using XGBoost and data from YouTube and Google APIs

I wanted to find the features that help me get more views on YouTube. I successfully identified which YouTube videos are successful with 85% accuracy. I used the YouTube API to gather data from my competitors on YouTube. I also used Google Vision's deep learning models to analyze thumbnail images, and added that data to the YouTube API data. Data from nearly 2,000 videos was used to train and test an XGBoost model.
- toc: true
- badges: true
- comments: true
- categories: [XGBoost,YouTube,API,Google Vision]
- image: images/youtube.jpg
1. Objective
We want to classify videos as successful (high views per subscriber) or unsuccessful by scraping data from YouTube and using the data to train an XGBoost model. The main sources of data for this project are the APIs for YouTube and Google Vision. From these two sources we gather information about each of my competitors' YouTube channels: views, subscribers, number of videos, titles of the videos, duration of each video, etc. From the YouTube API I also extract the thumbnail URL for each video. Later I explain how the Google Vision API analyzes the image of the thumbnail for facial and text recognition. I combine the YouTube data and the thumbnail data together for the analysis.
import os
os.chdir("C:\\Users\\merre\\Desktop\\envs\\yt_api_env\\Lib\\site-packages")
from numpy.core.fromnumeric import shape
from numpy.lib.function_base import diff
import pandas as pd
import re
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
from pandas.core.indexes.base import Index
import emoji
from scipy import stats
import fasttext
from collections import Counter
from pprint import pprint
from IPython.display import display_html
from itertools import chain,cycle
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
import shap
import xgboost as xgb
import matplotlib.pylab as pl
from sklearn.metrics import confusion_matrix, accuracy_score
import warnings
import altair as alt
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
YouTube API Data Pull
import json
import requests
from numpy import random
from time import sleep
import os
class YTstats:
    """Collect channel statistics and per-video metadata from the YouTube Data API v3.

    Typical use: call ``get_channel_statistics()`` and
    ``get_channel_video_data()`` (or ``extract_all()``), then ``dump()`` to
    write everything to ``<channel title>.json`` in the working directory.
    """

    # Maximum number of successfully fetched search pages per channel.
    MAX_PAGES = 2
    # Safety cap on the number of follow-up page requests.
    MAX_FOLLOW_UP_REQUESTS = 50

    def __init__(self, api_key, channel_id):
        self.api_key = api_key
        self.channel_id = channel_id
        self.channel_statistics = None
        self.video_data = None
        # Per-instance state; the original kept both in module globals, which
        # raised NameError unless the caller had defined them first.
        self._session = None
        self._num_pages = 0

    def extract_all(self):
        """Fetch the channel statistics and then the per-video data."""
        self.get_channel_statistics()
        self.get_channel_video_data()

    def get_channel_statistics(self):
        """Extract the channel statistics.

        Returns the ``statistics`` dict of the channel, or ``{}`` when the
        response does not contain one.
        """
        print('get channel statistics...')
        url = f'https://www.googleapis.com/youtube/v3/channels?part=statistics&id={self.channel_id}&key={self.api_key}'
        json_url = requests.get(url)
        data = json.loads(json_url.text)
        try:
            data = data['items'][0]['statistics']
        except (KeyError, IndexError):
            # IndexError: the response has an empty 'items' list.
            print('Could not get channel statistics')
            data = {}
        self.channel_statistics = data
        return data

    def get_channel_video_data(self):
        """Extract all video information of the channel.

        Returns a dict mapping video id to the merged 'snippet',
        'statistics', 'contentDetails' and 'topicDetails' parts.
        """
        self._session = requests.Session()
        print('get video data...')
        channel_videos, channel_playlists = self._get_channel_content(limit=50)
        parts = ["snippet", "statistics", "contentDetails", "topicDetails"]
        total = len(channel_videos)
        print(total)
        for position, video_id in enumerate(channel_videos, start=1):
            print(position / total)  # progress as a fraction of all videos
            for part in parts:
                data = self._get_single_video_data(video_id, part)
                channel_videos[video_id].update(data)
        self.video_data = channel_videos
        return channel_videos

    def _get_single_video_data(self, video_id, part):
        """
        Extract further information for a single video
        parts can be: 'snippet', 'statistics', 'contentDetails', 'topicDetails'

        Returns the requested part as a dict, or an empty dict when the
        response does not contain it.
        """
        sleep(random.uniform(1, 3)/2)  # randomized pause between requests
        if getattr(self, '_session', None) is None:
            # Allows calling this method without get_channel_video_data().
            self._session = requests.Session()
        url = f"https://www.googleapis.com/youtube/v3/videos?part={part}&id={video_id}&key={self.api_key}"
        json_url = self._session.get(url)
        data = json.loads(json_url.text)
        # Log the request without the URL: the URL embeds the API key.
        print(f'fetched {part} for video {video_id}')
        try:
            data = data['items'][0][part]
        except (KeyError, IndexError):
            print(f'Error! Could not get {part} part of data: \n{data}')
            data = dict()
        return data

    def _get_channel_content(self, limit=None, check_all_pages=True):
        """
        Extract all videos and playlists, can check all available search pages
        channel_videos = videoId: title, publishedAt
        channel_playlists = playlistId: title, publishedAt
        return channel_videos, channel_playlists

        At most MAX_PAGES successfully fetched pages are merged.
        """
        url = f"https://www.googleapis.com/youtube/v3/search?key={self.api_key}&channelId={self.channel_id}&part=snippet,id&order=date"
        if limit is not None and isinstance(limit, int):
            url += "&maxResults=" + str(limit)
        self._num_pages = 0
        vid, pl, npt = self._get_channel_content_per_page(url)
        idx = 0
        while (check_all_pages and npt is not None
               and idx < self.MAX_FOLLOW_UP_REQUESTS
               and self._num_pages < self.MAX_PAGES):
            nexturl = url + "&pageToken=" + npt
            next_vid, next_pl, npt = self._get_channel_content_per_page(nexturl)
            vid.update(next_vid)
            pl.update(next_pl)
            idx += 1
            print(check_all_pages, idx, npt)
        return vid, pl

    def _get_channel_content_per_page(self, url):
        """
        Extract all videos and playlists per page
        return channel_videos, channel_playlists, nextPageToken
        """
        sleep(random.uniform(1, 3))  # randomized pause between requests
        json_url = requests.get(url)
        data = json.loads(json_url.text)
        channel_videos = dict()
        channel_playlists = dict()
        if 'items' not in data:
            print('Error! Could not get correct channel data!\n', data)
            return channel_videos, channel_playlists, None
        # Only pages that actually returned items count towards MAX_PAGES.
        self._num_pages = getattr(self, '_num_pages', 0) + 1
        nextPageToken = data.get("nextPageToken", None)
        for item in data['items']:
            try:
                kind = item['id']['kind']
                published_at = item['snippet']['publishedAt']
                title = item['snippet']['title']
                if kind == 'youtube#video':
                    video_id = item['id']['videoId']
                    channel_videos[video_id] = {'publishedAt': published_at, 'title': title}
                elif kind == 'youtube#playlist':
                    playlist_id = item['id']['playlistId']
                    channel_playlists[playlist_id] = {'publishedAt': published_at, 'title': title}
            except KeyError:
                print('Error! Could not extract data from item:\n', item)
        return channel_videos, channel_playlists, nextPageToken

    def dump(self):
        """Dumps channel statistics and video data in a single json file"""
        if self.channel_statistics is None or self.video_data is None:
            print('data is missing!\nCall get_channel_statistics() and get_channel_video_data() first!')
            return
        fused_data = {self.channel_id: {"channel_statistics": self.channel_statistics,
                                        "video_data": self.video_data}}
        # Read the channel title from the last video WITHOUT removing it.
        # The original used popitem(), which deleted that video from the dump.
        videos = list(self.video_data.values())
        last_video = videos[-1] if videos else {}
        channel_title = last_video.get('channelTitle', self.channel_id)
        channel_title = channel_title.replace(" ", "_").lower()
        filename = channel_title + '.json'
        with open(filename, 'w') as f:
            json.dump(fused_data, f, indent=4)
        print('file dumped to', filename)
#brian hull UCiNeUoUWfBLC8mJuMzI6hvw
#Black Gryphon UCvzWGXYFDiiJ338KIJPhbhQ
#Brock Baker UCLzdMXE3R2xXIklfIO9HCcQ
# Ori UCra3g9Qvmgux0NyY2Pdj4Lw
# Scheiffer Bates UCcBacTJIf67LSU_-yeJwDvg
#Azerrz UCiwIAU4SNlrcv47504JrJeQ
#Danny padilla & mason sperling UCfhK8MfxO-9RCypkLDyW1rw
# Brizzy UC7lObFRyZgoZcMYHHqxi9lg
# Redfireball UC88CnZTYFz5ugp-JtDEQ3-g
# Sounds like pizza UCh6OfzCefcCGFfihPbe_Y4g
#joshiiwuh UCxRGk49YNiW3Cq8s7MGknqw
# simau UCkXvCWJjAqNcFwxF7hW_ZRQ
#Knep UCy7gv-FM-dMvw6dMtj8Qfgg
# charlie hopkinson UCewLMcro9tNP97XQ1rxtLXQ
#Uss JA doin UCqPYUMNbVeEhyTBIZCDO_VQ
# Shanieology UCR93YdwZ4UKEUwf1gA-ZusA
# BigShade UC7Wt6Nukmt83Bph3us5s5Aw
# Best in Class UClQhFMEVUxJAwMW-KdZ0SvQ
# Daniel Ferguson UCXFzOJmXVaP1tMLiww4aQzg
# Mikey Boltz UC0gXT2T6KtmV0IHNNNvruAQ
# Maxamili UC-0WjH-efG2qvNlZUBlX70Q
# The API key is read from the YT_API environment variable.
api_key = os.environ.get('YT_API')

# Full list of competitor channels (see the name/ID comments above):
# channel_ids= ['UCiNeUoUWfBLC8mJuMzI6hvw','UCvzWGXYFDiiJ338KIJPhbhQ','UCLzdMXE3R2xXIklfIO9HCcQ','UCra3g9Qvmgux0NyY2Pdj4Lw','UCcBacTJIf67LSU_-yeJwDvg',
#               'UCiwIAU4SNlrcv47504JrJeQ','UCfhK8MfxO-9RCypkLDyW1rw','UC7lObFRyZgoZcMYHHqxi9lg','UC88CnZTYFz5ugp-JtDEQ3-g','UCh6OfzCefcCGFfihPbe_Y4g',
#               'UCxRGk49YNiW3Cq8s7MGknqw','UCkXvCWJjAqNcFwxF7hW_ZRQ','UCy7gv-FM-dMvw6dMtj8Qfgg','UCewLMcro9tNP97XQ1rxtLXQ','UCqPYUMNbVeEhyTBIZCDO_VQ',
#               'UCR93YdwZ4UKEUwf1gA-ZusA','UC7Wt6Nukmt83Bph3us5s5Aw','UClQhFMEVUxJAwMW-KdZ0SvQ','UCXFzOJmXVaP1tMLiww4aQzg','UC0gXT2T6KtmV0IHNNNvruAQ',
#               'UC-0WjH-efG2qvNlZUBlX70Q']
# Channels pulled in this run.
channel_ids = [
    'UC-0WjH-efG2qvNlZUBlX70Q',
    'UClQhFMEVUxJAwMW-KdZ0SvQ',
]

for channel_id in channel_ids:
    # Module-level page counter read by YTstats while paging; reset per channel.
    num_pages = 0
    yt = YTstats(api_key, channel_id)
    # extract_all() runs get_channel_statistics() then get_channel_video_data().
    yt.extract_all()
    yt.dump()
Convert JSON to Pandas
import json
from os import replace
import pandas as pd
import re
from datetime import datetime, timedelta
import cv2
import urllib
import numpy as np
from skimage import io
import matplotlib.pyplot as plt
#C:/Users/merre/Desktop/data projects/
files = ["shanieology.json", "simau.json", "soundslikepizza.json", "azerrz.json", "BigShade.json", "black_gryph0n.json",
         "brian_hull.json", "brizzy_voices.json", "brock_baker.json", "charlie_hopkinson.json", "danny_padilla_&_mason_sperling.json",
         "ja_doin_stuff.json", "joshiiwuh.json", "knep.json", "ori.json", "redfireball555.json", "scheiffer_bates.json", "daniel_ferguson.json",
         "BigShade.json", "best_in_class.json", "maxamili.json", "mikey_bolts.json"]
data = None
df_channel_new = None
df_channel = None

# Column order of the per-channel CSV; build_video_row() returns values in this order.
VIDEO_COLUMNS = ['video_id', 'title', 'title_len', 'words', 'upper_pct', 'emoji_count', 'upload_date', 'upload_time', 'upload_day',
                 'upload_time_of_day', 'viewCount', 'likeCount', 'dislikeCount', 'favoriteCount', 'commentCount', 'duration',
                 'definition', 'caption', 'licensedContent', 'thumbnail_url', 'thumbnail_w', 'thumbnail_h', 'tags', 'num_tags',
                 'categoryId', 'liveBroadcastContent', 'defaultAudioLanguage', 'topicCategories', 'channel', 'channel_subs',
                 'channel_views', 'channel_videos', 'desc']


def title_features(title):
    """Return (length, word count, share of all-caps words, emoji count) of a title."""
    title_words = re.findall(r'\w+', title)
    words = len(title_words)
    upper_words = sum(1 for word in title_words if word.isupper())
    # A title without any word characters used to raise ZeroDivisionError.
    upper_pct = upper_words / words if words else 0.0
    emoji_count = len(re.findall(u'[\U0001f600-\U0001f650]', title))
    return len(title), words, upper_pct, emoji_count


def upload_features(published_at):
    """Return (date, time, weekday, time-of-day bucket) for a 'publishedAt' stamp.

    The timestamp is shifted by a fixed five hours (the original comment
    calls this Mexico City time); daylight saving time is not considered.
    Weekday 0 is Monday, 6 is Sunday.
    """
    local = datetime.strptime(published_at, '%Y-%m-%dT%H:%M:%SZ') - timedelta(hours=5)
    upload_date = local.date()
    upload_time = local.time()
    hms = (upload_time.hour, upload_time.minute, upload_time.second)
    if (4, 0, 0) <= hms <= (10, 30, 0):
        bucket = 'morning'
    elif (10, 30, 0) < hms <= (18, 0, 0):
        bucket = 'afternoon'
    elif (18, 0, 0) < hms <= (23, 0, 0):
        bucket = 'night'
    else:
        bucket = 'late_night'
    return upload_date, upload_time, upload_date.weekday(), bucket


def best_thumbnail(info):
    """Return (url, width, height) of the largest thumbnail, or three Nones."""
    thumbnails = info.get("thumbnails") or {}
    for size in ("maxres", "high", "default"):
        try:
            thumb = thumbnails[size]
            return thumb["url"], thumb["width"], thumb["height"]
        except (KeyError, TypeError):
            continue
    return None, None, None


def parse_duration_minutes(iso_duration):
    """Convert a duration string such as 'PT1H2M3S' to minutes.

    Components that are absent count as zero, so 'PT1H5S' keeps its hour
    (the original dropped the hours whenever minutes were missing).
    A missing duration yields 0.0, as before.
    """
    if not isinstance(iso_duration, str):
        return 0.0

    def component(unit):
        found = re.search(r'(\d+)' + unit, iso_duration)
        return int(found.group(1)) if found else 0

    return component('D') * 1440 + component('H') * 60 + component('M') + component('S') / 60


def build_video_row(video_id, info, channel_subs, channel_views, channel_videos):
    """Build one CSV row (ordered like VIDEO_COLUMNS) from a video's metadata dict.

    'title' and 'publishedAt' are required; every other key is optional and
    becomes None when absent.
    """
    title = info["title"]
    title_len, words, upper_pct, emoji_count = title_features(title)
    upload_date, upload_time, upload_day, upload_time_of_day = upload_features(info["publishedAt"])
    thumbnail_url, thumbnail_w, thumbnail_h = best_thumbnail(info)
    tags = info.get("tags")
    # The original assigned `tag = None` here, so len(tags) either failed or
    # reused the previous video's tags.
    num_tags = len(tags) if tags is not None else 0
    return [video_id, title, title_len, words, upper_pct, emoji_count, upload_date, upload_time, upload_day,
            upload_time_of_day, info.get("viewCount"), info.get("likeCount"), info.get("dislikeCount"),
            info.get("favoriteCount"), info.get("commentCount"), parse_duration_minutes(info.get("duration")),
            info.get("definition"), info.get("caption"), info.get("licensedContent"), thumbnail_url, thumbnail_w,
            thumbnail_h, tags, num_tags, info.get("categoryId"), info.get("liveBroadcastContent"),
            info.get("defaultAudioLanguage"), info.get("topicCategories"), info.get("channelTitle"), channel_subs,
            channel_views, channel_videos, info.get("description")]


def output_name(file):
    """Return the CSV file name for a channel JSON file ('x.json' -> 'x.txt')."""
    if file.endswith('.json'):
        return file[:-len('.json')] + '.txt'
    return file + '.txt'


def convert_channel_file(file):
    """Read one channel JSON dump and write its videos as a CSV; return the frame."""
    with open(file, 'r') as f:
        data = json.load(f)
    channel_id, channel = data.popitem()
    channel_stats = channel["channel_statistics"]
    video_stats = channel["video_data"]
    channel_views = channel_stats["viewCount"]
    channel_subs = channel_stats["subscriberCount"]
    channel_videos = channel_stats["videoCount"]
    try:
        # Most viewed first; fall back to file order when a count is missing.
        sorted_vids = sorted(video_stats.items(), key=lambda item: int(item[1]["viewCount"]), reverse=True)
    except (KeyError, TypeError, ValueError):
        sorted_vids = list(video_stats.items())
    rows = [build_video_row(video_id, info, channel_subs, channel_views, channel_videos)
            for video_id, info in sorted_vids]
    # Passing columns= also works for a channel without videos.
    df = pd.DataFrame(rows, columns=VIDEO_COLUMNS)
    df.to_csv(output_name(file))
    return df


# In a notebook or when run as a script __name__ is "__main__", so the
# conversion still runs; the guard only keeps an import of these helpers
# from touching the file system.
if __name__ == "__main__":
    for file in files:
        df = convert_channel_file(file)
Functions to Query Google Vision API
import httplib2
import sys
from googleapiclient import discovery
from oauth2client import tools, file, client
import json
import os
import cv2
from base64 import b64encode
import numpy as np
# limited preview only (sorry!)
# Path taken from the GOOGLE_VISION_API environment variable (None when unset).
# NOTE(review): the functions below open this same path as the discovery
# document, as the oAuth token storage and as the service-account JSON --
# confirm that a single file is really meant to serve all three roles.
API_DISCOVERY_FILE = os.environ.get('GOOGLE_VISION_API')
""" Google Authentication Utilities """
def get_vision_api():
    """Build the Vision API client from the discovery document at API_DISCOVERY_FILE."""
    creds = get_api_credentials('https://www.googleapis.com/auth/cloud-platform')
    with open(API_DISCOVERY_FILE, 'r') as handle:
        discovery_doc = handle.read()
    return discovery.build_from_document(
        discovery_doc,
        credentials=creds,
        http=httplib2.Http(),
    )
def get_api_credentials(scope, service_account=True):
    """ Build API client based on oAuth2 authentication

    scope: oAuth2 scope URL to request.
    service_account: True for the server-to-server flow using the JSON key
        at API_DISCOVERY_FILE, False for the interactive oAuth2 flow that
        reads client_secrets.json next to this file.
    Returns the stored credentials when they are still valid, otherwise
    freshly obtained ones.
    """
    # Imported here because the original referenced argparse without ever
    # importing it, so the interactive flow failed with NameError.
    import argparse

    # NOTE(review): the token storage and the service-account key are the
    # same path, so STORAGE.put() below writes over the key file -- confirm.
    STORAGE = file.Storage(API_DISCOVERY_FILE)  # local storage of oAuth tokens
    credentials = STORAGE.get()
    if credentials is not None and not credentials.invalid:
        return credentials  # no new oAuth flow needed

    if service_account:  # server 2 server flow
        with open(API_DISCOVERY_FILE) as f:
            account = json.load(f)
        email = account['client_email']
        key = account['private_key']
        credentials = client.SignedJwtAssertionCredentials(email, key, scope=scope)
        STORAGE.put(credentials)
    else:  # normal oAuth2 flow
        # NOTE(review): __file__ is not defined inside a notebook cell.
        CLIENT_SECRETS = os.path.join(os.path.dirname(__file__), 'client_secrets.json')
        FLOW = client.flow_from_clientsecrets(CLIENT_SECRETS, scope=scope)
        PARSER = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, parents=[tools.argparser])
        FLAGS = PARSER.parse_args(sys.argv[1:])
        credentials = tools.run_flow(FLOW, STORAGE, FLAGS)
    return credentials
""" read/write utilities """
def read_image(filename):
    """Load the image stored at `filename` with OpenCV and return it."""
    image = cv2.imread(filename)
    return image
def save_image(filename, im):
    """Write `im` to `filename` after an RGB-to-BGR channel conversion."""
    converted = cv2.cvtColor(im, cv2.COLOR_RGB2BGR)
    cv2.imwrite(filename, converted)
def read_image_base64(filename):
    """Return the raw bytes of `filename` encoded as base64 bytes."""
    with open(filename, 'rb') as handle:
        raw = handle.read()
    return b64encode(raw)
""" OpenCV drawing utilities """
def draw_face(im, annotations):
    """Classify the emotion of each confidently detected face and annotate the image.

    im: image array to draw on, or None to skip all drawing.
    annotations: list of face-annotation dicts from the Vision response.
    Returns one label per face whose 'detectionConfidence' exceeds 0.4:
    "happy", "sad", "angry", "suprised" or "other". The spelling
    "suprised" is kept on purpose because later code matches that string.
    """
    emotion_keys = ('joyLikelihood', 'sorrowLikelihood', 'angerLikelihood', 'surpriseLikelihood')
    emotion_text = ("happy", "sad", "angry", "suprised")
    faces = []
    for a in annotations:
        if a.get('detectionConfidence', 0) <= .4:
            continue

        # Drawing is best effort: a failed box must not stop classification.
        # (The original fallback `tl_,br_=None` itself raised TypeError.)
        tl_ = None
        if im is not None:
            try:
                tl_, _ = draw_box(im, a['fdBoundingPoly']['vertices'])
            except Exception:
                tl_ = None

        emotions = [a.get(key, '') for key in emotion_keys]
        text = "other"
        # The strongest likelihood wins; ties go to the first emotion listed.
        for level in ('VERY_LIKELY', 'LIKELY', 'POSSIBLE'):
            if level in emotions:
                text = emotion_text[emotions.index(level)]
                break
        faces.append(text)

        if im is None:
            continue
        if tl_ is not None:
            draw_text(im, text, tl_)
        for landmark in a.get('landmarks') or []:
            try:
                draw_point(im, landmark['position'])
            except Exception:
                pass  # skip landmarks that cannot be drawn
    return faces
def extract_vertices(vertices):
    """ Extract two opposite vertices from a list of 4 (assumption: rectangle)

    Returns (v1, v2): the vertex at (min x, min y) and the vertex at
    (max x, max y), or (None, None) when the list is empty or no vertex
    sits at those corners.

    A vertex without an 'x' or 'y' key is treated as having that coordinate
    equal to 0, matching the defaults draw_box() uses. The original used
    the y extremes as defaults for x and then stored None as an extreme,
    which corrupted every later comparison.
    """
    if not vertices:
        return None, None

    def coords(v):
        return v.get('x', 0), v.get('y', 0)

    xs = [v.get('x', 0) for v in vertices]
    ys = [v.get('y', 0) for v in vertices]
    low = (min(xs), min(ys))
    high = (max(xs), max(ys))
    v1 = next((v for v in vertices if coords(v) == low), None)
    v2 = next((v for v in vertices if coords(v) == high), None)
    if v1 is None or v2 is None:
        return None, None
    return v1, v2
def draw_box(im, vertices):
    """Draw a rectangle through two opposite vertices and return its corner points.

    Returns (pt1, pt2) as (x, y) tuples, or (None, None) when the corners
    cannot be determined or the rectangle cannot be drawn.
    """
    corner_a, corner_b = extract_vertices(vertices)
    try:
        top_left = (corner_a.get('x', 0), corner_a.get('y', 0))
        bottom_right = (corner_b.get('x', 0), corner_b.get('y', 0))
        cv2.rectangle(im, top_left, bottom_right, (0, 0, 255), thickness=4)
    except:
        return None, None
    return top_left, bottom_right
def draw_point(im, position):
    """Draw a radius-3 circle at `position` (dict with optional 'x'/'y') and return the point."""
    x = int(position.get('x', 0))
    y = int(position.get('y', 0))
    center = (x, y)
    cv2.circle(im, center, 3, (0, 0, 255))
    return center
def draw_text(im, text, loc):
    """Write `text` onto `im` at `loc` and return the same image object.

    The stroke thickness grows with the image size (0.2% of the mean of
    height and width, plus 10).
    """
    stroke = round(0.002 * (im.shape[0] + im.shape[1]) / 2) + 10
    cv2.putText(im, text, loc, cv2.FONT_HERSHEY_SIMPLEX, 2.5, (102, 255, 0), stroke)
    return im
Google Vision API Data Pull
from datetime import date
import datetime
import json
from webbrowser import get
from google.cloud.vision_v1.types.image_annotator import AnnotateImageRequest, AnnotateImageResponse
from numpy.core.fromnumeric import shape
from numpy.core.numeric import NaN
from numpy.lib.arraysetops import unique
from skimage.util import dtype
from functions_for_google_vision_api import (get_vision_api, read_image, read_image_base64, save_image, draw_face, draw_box, draw_text)
from skimage import io
import os
from google.cloud import vision_v1
from google.cloud import vision
from google.cloud.vision_v1 import types
import cv2
import pandas as pd
import numpy as np
import itertools
import time
import random
#####################################################################
import httplib2
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
# Copy the GOOGLE_VISION_API environment variable into
# GOOGLE_APPLICATION_CREDENTIALS. If GOOGLE_VISION_API is unset,
# os.environ.get() returns None and this assignment raises TypeError,
# because environment values must be strings.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.environ.get('GOOGLE_VISION_API')
# Discovery URL template handed to discovery.build() in get_vision_service().
DISCOVERY_URL='https://{api}.googleapis.com/$discovery/rest?version={apiVersion}'
def get_vision_service():
    """Build a Vision v1 service client from the application-default credentials."""
    default_credentials = GoogleCredentials.get_application_default()
    service = discovery.build(
        'vision',
        'v1',
        credentials=default_credentials,
        discoveryServiceUrl=DISCOVERY_URL,
    )
    return service
def main(video_id, inputfile, output_dir="C:/Users/merre/Desktop/ws/data/youtube_jadoinstuff/output_images"):
    """Annotate one thumbnail with the Vision API and return its feature row.

    video_id: id stored as the first element of the returned row.
    inputfile: thumbnail URL; it is sent to the API as the image URI and is
        also downloaded by show_results().
    output_dir: directory for the annotated copy of the thumbnail. The
        default is the location that was previously hard-coded.
    Returns [video_id, inputfile, labels, faces, texts, adult, medical,
    racy, spoof, violence], with numpy arrays converted to plain lists.
    """
    service = get_vision_service()
    # Name the output after the second-to-last path segment of the URL.
    last_slash = inputfile.rfind('/')
    previous_slash = inputfile.rfind('/', 0, last_slash)
    outputfile = output_dir + "/thumbnail_" + inputfile[previous_slash + 1:last_slash] + ".jpg"
    batch_request = [
        {
            "features": [
                {"maxResults": 50, "type": "FACE_DETECTION"},
                {"maxResults": 50, "type": "LABEL_DETECTION"},
                {"maxResults": 20, "type": "SAFE_SEARCH_DETECTION"},
                {"maxResults": 50, "type": "TEXT_DETECTION"},
            ],
            "image": {
                "source": {
                    "imageUri": inputfile
                }
            }
        }
    ]
    request = service.images().annotate(body={
        'requests': batch_request,
    })
    time.sleep(random.random()*4)  # random pause of up to four seconds
    response = request.execute()
    inputfile, labels, faces, texts, adult, medical, racy, spoof, violence = show_results(inputfile, response, outputfile)
    vars_list = [video_id, inputfile, labels, faces, texts, adult, medical, racy, spoof, violence]
    # Arrays become plain lists so the row can be stored in a DataFrame cell.
    return [v.tolist() if isinstance(v, np.ndarray) else v for v in vars_list]
def show_results(inputfile, data, outputfile):
    """Extract labels, faces, texts and safe-search ratings from a Vision response.

    inputfile: image location read with skimage's io.imread.
    data: response dict with a 'responses' list.
    outputfile: where the annotated image is saved.
    Returns (inputfile, labels, faces, texts, adult, medical, racy, spoof,
    violence). labels and texts are de-duplicated numpy arrays; the five
    ratings are '' when the response has no safe-search annotation (the
    original raised UnboundLocalError in that case).
    """
    # read original file
    im = io.imread(inputfile)
    faces = []
    labels = []
    texts = []
    adult = medical = racy = spoof = violence = ''
    # draw face, boxes and text for each response
    for r in data['responses']:
        if 'faceAnnotations' in r:
            faces = draw_face(im, r['faceAnnotations'])
        for label in r.get('labelAnnotations', []):
            description = label.get('description')
            # Keep only labels scored above 0.6.
            if label.get('score', 0) > .6 and description is not None:
                labels.append(description)
        for a in r.get('textAnnotations', []):
            description = a.get('description')
            if description:
                texts.append(description)
        if 'safeSearchAnnotation' in r:
            safe = r['safeSearchAnnotation']
            adult = safe.get("adult", '')
            medical = safe.get("medical", '')
            racy = safe.get("racy", '')
            spoof = safe.get("spoof", '')
            violence = safe.get("violence", '')
    # np.unique rather than the bare name `unique`: a later cell rebinds
    # `unique` to an array, which would break this function.
    labels = np.unique(labels)
    texts = np.unique(texts)
    # save to output file
    save_image(outputfile, im)
    return inputfile, labels, faces, texts, adult, medical, racy, spoof, violence
# files= ["shanieology.txt","simau.txt","soundslikepizza.txt","azerrz.txt","BigShade.txt","black_gryph0n.txt"
# ,"brian_hull.txt","brizzy_voices.txt","brock_baker.txt","charlie_hopkinson.txt","danny_padilla_&_mason_sperling.txt"
# ,"ja_doin_stuff.txt","joshiiwuh.txt","knep.txt","ori.txt","redfireball555.txt","scheiffer_bates.txt","daniel_ferguson.txt",
# "BigShade.txt","best_in_class.txt","maxamili.txt","mikey_bolts.txt"]
files = ["daniel_ferguson.txt", "BigShade.txt", "best_in_class.txt", "maxamili.txt", "mikey_bolts.txt"]

# Only videos uploaded after this date are sent to the Vision API.
upload_cutoff = datetime.datetime(2012, 7, 1, 0, 0, 0, 0)

vid_ids = []
vid_thumb_urls = []
for file in files:
    videos_loop = pd.read_csv(file)
    recent = videos_loop[pd.to_datetime(videos_loop["upload_date"]) > upload_cutoff]
    vid_ids.extend(recent["video_id"])
    vid_thumb_urls.extend(recent["thumbnail_url"])

df = pd.DataFrame(columns=['video_id', 'thumbnail_url', 'labels', 'faces', 'texts', 'adult', 'medical', 'racy', 'spoof', 'violence'])

ii = 0                # thumbnails annotated successfully
last_saved_count = 0  # value of ii at the most recent checkpoint
for i, (video_id, thumb_url) in enumerate(zip(vid_ids, vid_thumb_urls)):
    # pd.isna() detects missing URLs; the original `is not NaN` identity
    # test did not, and relied on main() failing for them instead.
    if not pd.isna(thumb_url):
        time.sleep(5)
        try:
            df.loc[len(df)] = main(video_id=video_id, inputfile=thumb_url)
        except Exception as exc:
            # Keep going, but report which thumbnail failed and why.
            print("Skipping video", video_id, "-", repr(exc))
        else:
            ii += 1
    is_last = i == len(vid_ids) - 1
    # Checkpoint once per 30 new successes and at the end. The original
    # rewrote a new file on every iteration while ii % 30 == 0 held.
    if (ii % 30 == 0 and ii != last_saved_count) or is_last:
        last_saved_count = ii
        df.to_csv('thumbnail_data_'+str(datetime.datetime.now()).replace('-','').replace(' ','_').replace(':','-')+'.txt', header=True, index=None, mode='w')
        print("Num videos",i,"---- Percent complete:",(round(i/len(vid_ids),3))*100)
The Google Vision API uses deep learning to identify the number of faces, the facial expressions and the text contained in the thumbnail image of each YouTube video. An example of the facial recognition is shown below. Google Vision also accurately identifies any text inside the thumbnail image. The thumbnail below shows that the deep learning algorithm correctly identified two surprised faces and two happy faces.
![]()
I combined the YouTube data with the thumbnail image data for each channel, including nearly 100 videos for each channel.
os.chdir("C:\\Users\\merre\\Desktop\\ws\\data\\youtube_jadoinstuff")
# NOTE(review): "BigShade.txt" is listed twice, so its videos are loaded twice
# and end up duplicated after the merge below -- confirm whether intended.
files = ["shanieology.txt", "simau.txt", "soundslikepizza.txt", "azerrz.txt", "BigShade.txt", "black_gryph0n.txt",
         "brian_hull.txt", "brizzy_voices.txt", "brock_baker.txt", "charlie_hopkinson.txt", "danny_padilla_&_mason_sperling.txt",
         "ja_doin_stuff.txt", "joshiiwuh.txt", "knep.txt", "ori.txt", "redfireball555.txt", "scheiffer_bates.txt", "daniel_ferguson.txt",
         "BigShade.txt", "best_in_class.txt", "maxamili.txt", "mikey_bolts.txt"]

# Loop through all the youtubers' data files and combine them into one data frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# 'Unnamed: 0' is the index column written by to_csv.
df = pd.concat([pd.read_csv(file).drop(['Unnamed: 0'], axis=1) for file in files])

# Read in the files that have the thumbnail data, and combine with the youtuber data
df_thumb = pd.concat([
    pd.read_csv("thumbnail_data_20210801_23-03-21.638854.txt"),
    pd.read_csv("thumbnail_data_20210802_21-39-16.451172.txt"),
])
df_all = pd.merge(
    df.drop('defaultAudioLanguage', axis=1),
    df_thumb.drop(['thumbnail_url'], axis=1),
    on="video_id",
    how="inner",
).drop(['emoji_count'], axis=1)
# One text field per video: title, tags and description concatenated.
df_all['all_text'] = df_all['title'].astype(str) + df_all['tags'].astype(str) + df_all['desc'].astype(str)
To get an overview of the data, let's check the first rows and the size of the data set. We can see the data has 1,940 rows and 40 columns.
# Bare expression: in a notebook cell this displays the merged data frame.
df_all
There are 1940 YouTube videos for the analysis and 40 features.
# Share of missing values per column, rounded to four decimals.
print(df_all.isna().mean().round(4))
The variable topicCategories is missing almost 6% of its values. Imputation may be difficult for these categories, but I will combine similar categories and create new fields. Below are all the different topics.
# Collect every upper-cased topic token (the URL tail, e.g. /NAME') from all videos.
_topic_tokens = []
for x in df['topicCategories']:
    _topic_tokens.extend(re.findall(r'/[\w_-]+\'', str(x).upper()))
# Count each distinct token, then order the counts from most to least frequent.
unique, counts = np.unique(np.array(_topic_tokens), return_counts=True)
dic = dict(zip(unique, counts))
dic2 = {topic: count for topic, count in sorted(dic.items(), key=lambda x: x[1], reverse=True)}
print(dic2)
We will combine all of the topics above and create new columns called topic_entertain,topic_video_game,topic_music, and topic_film_tv. After that we will drop topicCategories as we now have new fields and imputing the missing values does not work. We will print off the new shape of the dataset to see how many rows we lost.
# Upper-cased text of each video's topic list; each flag below is 1 when the
# keyword occurs in it and 0 otherwise.
_topic_text = [str(x).upper() for x in df_all['topicCategories']]
df_all["topic_entertain"] = [int('ENTERTAINMENT' in text) for text in _topic_text]
df_all["topic_video_game"] = [int('_GAME' in text) for text in _topic_text]
df_all["topic_music"] = [int('MUSIC' in text) for text in _topic_text]
df_all["topic_film_tv"] = [int('FILM' in text or 'TELEVISION' in text) for text in _topic_text]
# Now we can drop topicCategories since the topic_* fields replace it
df_all = df_all.drop(columns='topicCategories')
# We can now remove all rows that have missing values since the column with the most missing values is gone
df_all = df_all.dropna().reset_index()
print(shape(df_all))
3.2. Creating Additional Variables
Now we will create additional variables that may be useful for predicting the success of a YouTube video. The following uses each video's title, description, and tags to create new fields to analyze each video's text. Regular expressions are used several times to parse out words and other special characters, such as emojis, from the text.
def _count_contained(text, words):
    """Return how many of `words` occur as substrings of the upper-cased `text`."""
    haystack = str(text).upper()
    return sum(1 for word in words if word in haystack)


# Thumbnail labels: split on commas, replace anything that is not a letter or
# whitespace with a space, trim and upper-case.
all_labels = [
    [re.sub(r'[^a-zA-Z\u00C0-\u00FF\s]', " ", i).strip(' \t\n\r').upper() for i in x.split(',')]
    for x in df_all["labels"]
]
all_labels_flat = list(set([item for elem in all_labels for item in elem]))
df_all['labels_words'] = all_labels
# Scans the title for any emojis in general (1 if at least one, else 0)
df_all["emoji_count"] = [(emoji.emoji_count(x) > 0) * 1 for x in df_all["title"]]

### Create variables to see how well title reflects the description, thumbnail and tags of the video
df_all['labels_word_count'] = [len(words) for words in all_labels]

# The text read in by the Google Vision text detection is messy.
# Clean it up and create new variables "thumb_words" and "thumb_word_count".
# discard_list holds the empty string and single consonants to throw away.
discard_list = ["", "B", 'C', 'D', '', 'F', 'G', 'H', '', 'J', 'K', 'L', 'M', 'N', '', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z']
_discard = set(discard_list)


def _thumb_words(texts):
    """Return the distinct upper-cased words found in a thumbnail's detected text."""
    found = re.findall(r'[^\\n][\s\?\!\"\']+[a-zA-Z\u00C0-\u00FF]+[\s\.\?\!\"\']?', str(texts))
    tokens = re.sub(r'[^a-zA-Z\u00C0-\u00FF]', " ", str(found)).upper().split(" ")
    return [token for token in set(tokens) if token not in _discard]


# Whole columns are built as lists and assigned once. The original wrote
# single cells through chained indexing (df_all[col][i] = ...), which pandas
# warns about and which does not update the frame under copy-on-write.
thumb_words = [_thumb_words(texts) for texts in df_all['texts']]
df_all['thumb_words'] = thumb_words
df_all['thumb_word_count'] = [len(words) for words in thumb_words]
df_all['title_in_desc'] = [int(title.upper() in desc.upper())
                           for title, desc in zip(df_all['title'], df_all['desc'])]
df_all['thumb_words_in_title'] = [_count_contained(title, words)
                                  for title, words in zip(df_all['title'], thumb_words)]
df_all['thumb_words_in_tags'] = [_count_contained(tags, words)
                                 for tags, words in zip(df_all['tags'], thumb_words)]
df_all['label_words_in_title'] = [_count_contained(title, words)
                                  for title, words in zip(df_all['title'], all_labels)]
df_all['label_words_in_tags'] = [_count_contained(tags, words)
                                 for tags, words in zip(df_all['tags'], all_labels)]

## See how many faces of each emotion are in the thumbnails
# "suprised" is spelled the way the face labels are written by draw_face().
_face_words = [re.findall(r'[a-zA-Z\u00C0-\u00FF]+', faces) for faces in df_all['faces']]
for _column, _emotion in (('faces_surprised', 'suprised'), ('faces_angry', 'angry'),
                          ('faces_happy', 'happy'), ('faces_other', 'other')):
    df_all[_column] = [words.count(_emotion) for words in _face_words]
Now we will create variables that measure how much people enjoyed the videos. More likes, comments and a higher like to dislike percentage mean people enjoyed the video more. We also create views_per_sub (number of views divided by the number of subscribers) which will be used as the response variable in this study.
# Simple ratios: (new column, numerator column, denominator column).
for _target, _top, _bottom in (
    ('likes_views_ratio', 'likeCount', 'viewCount'),
    ('likes_subs_ratio', 'likeCount', 'channel_subs'),
    ('comment_views_ratio', 'commentCount', 'viewCount'),
):
    df_all[_target] = df_all[_top] / df_all[_bottom]

# Falls back to the ratio of the column means if the row-wise division fails.
try:
    df_all['comment_likes_ratio'] = df_all['commentCount'] / df_all['likeCount']
except:
    df_all['comment_likes_ratio'] = df_all['commentCount'].mean() / df_all['likeCount'].mean()

for _target, _top, _bottom in (
    ('comment_subs_ratio', 'commentCount', 'channel_subs'),
    ('views_favorite_ratio', 'favoriteCount', 'viewCount'),
):
    df_all[_target] = df_all[_top] / df_all[_bottom]

# Share of likes among all likes and dislikes.
_reactions = df_all['likeCount'] + df_all['dislikeCount']
df_all['like_percent'] = df_all['likeCount'] / _reactions

# Days between each upload and the latest upload in the data (latest = 1).
max_date = max(df_all['upload_date'])
_newest = pd.to_datetime(max_date)
df_all['days_since_upload'] = (_newest - pd.to_datetime(df_all['upload_date'])).dt.days + 1

# Response variable: views per channel subscriber.
df_all['views_per_sub'] = df_all['viewCount'] / df_all['channel_subs']
In the chart below we can explore views_per_sub on the x axis and likes_subs_ratio on the y axis. We can select the number of channel subscribers (in thousands) from the drop-down menu.
# Drop-down options: each channel's subscriber count in thousands.
channel_subs = list(df_all["channel_subs"].sort_values().unique()/1000)

# use specific hard-wired values as the initial selected values
# The selection works on 'channel_subs_k', a field computed in the chart
# below. The original compared the raw 'channel_subs' values with options
# given in thousands, so no point could ever match the selection.
selection = alt.selection_single(
    name='Select',
    fields=['channel_subs_k'],
    init={'channel_subs_k': 387},
    bind={'channel_subs_k': alt.binding_select(options=channel_subs)}
)

# scatter plot, modify opacity based on selection
alt.Chart(df_all[df_all["views_per_sub"]<5]).transform_calculate(
    # subscribers in thousands, same unit as the drop-down options
    channel_subs_k='datum.channel_subs / 1000'
).mark_circle().add_selection(
    selection
).encode(
    x='views_per_sub:Q',
    y='likes_subs_ratio:Q',
    # the column is named 'title'; 'Title' does not exist in df_all
    tooltip='title:N',
    opacity=alt.condition(selection, alt.value(0.85), alt.value(0.15))
)